This file accompanies the Vizgen Analysis Guides: Cell Type Annotation.
Here we will look at the implementation of an open source package SingleR to assist in cell type annotation.
knitr::opts_chunk$set(echo = TRUE)
library(Seurat) # to run normalizaiton methods
# BiocManager::install('SingleR')
# install.packages('matrixStats')
# BiocManager::install('scran')
library(scran)## Warning: package 'matrixStats' was built under R version 4.3.1
## Warning: package 'IRanges' was built under R version 4.3.1
## Warning: package 'GenomeInfoDb' was built under R version 4.3.1
library(SingleR)
library(data.table) # data processing library
library(ggplot2) # package to make visualizaitons
# BiocManager::install('scRNAseq')
library(scRNAseq) # package to access reference scRNA dataHere we are going to use publicaly available data from a MERSCOPE mouse brain sample.
For our purposes we only need cell_by_gene.csv and cell_metadata.csv files.
md <- fread('https://vzg-vzg-showcase-data.s3.amazonaws.com/Mouse_Brain_Showcase_232/Slice-1_Replicate-1/cell_metadata.csv',
header = T, data.table = FALSE)
counts <- fread('https://vzg-vzg-showcase-data.s3.amazonaws.com/Mouse_Brain_Showcase_232/Slice-1_Replicate-1/cell_by_gene.csv',
header = T, data.table = FALSE)
rownames(md) <- md$EntityID
rownames(counts) <- counts$cell
counts <- counts[,-1] # removing cells as a column so we have a data frame of only counts
blanks <- counts[,grepl('Blank', colnames(counts))]
counts <- counts[,!grepl('Blank', colnames(counts))] # removing blanks
counts <- t(counts)
# counts[1:5,1:5]
# table(rownames(md) == colnames(counts)) # making sure the data frames alignggplot(md, aes(x = center_x, y = -center_y)) +
geom_point(size = .1) + theme_bw()Here we are going to use the Chen et al. 2017 mouse brain.
Below is a table of the cell type labels present in the data and the number of cells they have for each type.
ref <- ChenBrainData()## see ?scRNAseq and browseVignettes('scRNAseq') for documentation
## loading from cache
## see ?scRNAseq and browseVignettes('scRNAseq') for documentation
## loading from cache
table(ref$SVM_clusterID, exclude = F)##
## Astro Ependy Epith1 Epith2 GABA1 GABA10 GABA11 GABA12 GABA13 GABA14
## 1148 413 818 379 23 23 35 62 165 71
## GABA15 GABA16 GABA17 GABA18 GABA2 GABA3 GABA4 GABA5 GABA6 GABA7
## 112 66 50 31 27 97 18 73 49 17
## GABA8 GABA9 Glu1 Glu10 Glu11 Glu12 Glu13 Glu14 Glu15 Glu2
## 402 71 13 15 42 24 24 51 25 21
## Glu3 Glu4 Glu5 Glu6 Glu7 Glu8 Glu9 Hista IMO Macro
## 13 131 200 51 211 50 35 17 151 167
## Micro MO OPC POPC SCO Tany zothers
## 724 3541 1741 51 32 609 2348
# remotes::install_github("LTLA/scuttle")
library(scuttle)
ref <- logNormCounts(ref)We are going to create a Seurat object out of our data.
spat <- CreateSeuratObject(counts = counts,
assay = 'RNA',
meta.data = md)
spat <- subset(spat, nCount_RNA >= 100)
spat <- NormalizeData(spat, normalization.method = 'LogNormalize',
scale.factor = 1e4)
spat <- ScaleData(spat, features = rownames(spat))
spat <- RunPCA(spat, features = rownames(spat), verbose = FALSE)
spat <- FindNeighbors(spat, dims = 1:10)
spat <- FindClusters(spat, resolution = .5)## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 67729
## Number of edges: 2018918
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9224
## Number of communities: 18
## Elapsed time: 23 seconds
spat <- RunUMAP(spat, dims = 1:10)## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
# We need the Seurat object as a single cell expeirment for SingleR
test <- as.SingleCellExperiment(spat)
# Running SingleR
pred <- SingleR(test = test, ref = ref, labels = ref$SVM_clusterID,
de.method = 'wilcox')
spat$labels_fine <- pred$labels
spat$delta <- pred$delta.next
spat$labels_prune <- pred$pruned.labelsBelow is a table of the predicted labels.
table(pred$labels)##
## Astro Ependy Epith1 Epith2 GABA1 GABA10 GABA11 GABA12 GABA13 GABA14 GABA15
## 11202 75 6264 1443 5676 7 8 100 9 21 8
## GABA16 GABA17 GABA18 GABA2 GABA3 GABA4 GABA5 GABA6 GABA7 GABA8 GABA9
## 7 132 22 294 2948 2402 1035 123 56 1 6
## Glu1 Glu10 Glu11 Glu12 Glu13 Glu14 Glu15 Glu2 Glu3 Glu4 Glu5
## 10 6 3 11 21 199 99 144 161 10516 2374
## Glu6 Glu7 Glu8 Glu9 Hista IMO Macro Micro MO OPC POPC
## 1749 8805 1 222 434 218 155 2003 6603 2109 6
## SCO Tany
## 19 22
Here I’m going to combine some of the groups into larger groups for easy of analysis.
The larger groups I’m going to create are GABA, Glu, and Epith; which combine the many GABA+, Glu+, and Epith labels.
spat$labels_general <- ifelse(grepl('GABA', spat$labels_fine),'GABA',
ifelse(grepl('Glu', spat$labels_fine),'Glu',
ifelse(grepl('Epith', spat$labels_fine), 'Epith',spat$labels_fine)))
# table(spat$labels_fine, spat$labels_general)We can view how the labels look by comparing them to clusters, viewing in UMAP coordinates, and viewing with the spatial context.
We would expect cell type labels to group strongly with clusters.
df <- as.data.frame(table(spat$seurat_clusters, spat$labels_general))
# df
df$Var1.sum <- NA
for (i in unique(df$Var1)){
df[df$Var1 == i,]$Var1.sum <- sum(df[df$Var1 == i,]$Freq)
}
df$Var2.sum <- NA
for (i in unique(df$Var2)){
df[df$Var2 == i,]$Var2.sum <- sum(df[df$Var2 == i,]$Freq)
}
df$Var1.perc <- df$Freq / df$Var1.sum
df$Var2.perc <- df$Freq / df$Var2.sum
df$Jaccard <- df$Freq / (df$Var1.sum + df$Var2.sum - df$Freq)
order_ <- c()
cnts_ <- c()
for (i in unique(df$Var1)){
temp <- df[df$Var1 == i,]
order_ <- c(order_,which.max(temp$Freq))
cnts_ <- c(cnts_,max(temp$Freq))
}
temp <- data.frame(row.names = unique(df$Var1),
order = order_,
cnts = cnts_)
df$Var1_ordered <- factor(df$Var1,
levels = rownames(temp[order(temp$order, -temp$cnts),]))
df$`Cluster\nPercentage` <- df$Var1.perc
ggplot(df, aes(x = Var1_ordered, y = Var2, label = Freq, fill = `Cluster\nPercentage`)) +
geom_tile() +
xlab('Clusters') + ylab('Cell Labels') +
geom_text(size = 3) +
scale_fill_gradient2(low = 'white', mid = "pink", high = 'red',
midpoint = .5) +
ggtitle('Cross-Tabulation of Clusters vs Predicted Cell Labels')We see that some clusters are made up of primarily one type of cell (ex. cluster 1 is almost exclusively astro), whereas others show a combination of multipe cell types (ex. cluster 16 is made up in large parts by epith, GABA, and Glu).
Below is the general UMAP plot with the labels added.
DimPlot(spat)DimPlot(spat, group.by = 'labels_general', label = T)cols_ <- scales::hue_pal()(length(unique(spat$labels_general)))
names(cols_) <- levels(as.factor(spat$labels_general))for (i in levels(as.factor(spat$labels_general))){
# print(i)
temp_cols <- ifelse(names(cols_) == i,cols_[i], 'lightgrey')
counts_ <- sum(spat$labels_general == i)
print(DimPlot(spat, group.by = 'labels_general') +
scale_color_manual(values = temp_cols) +
ggtitle(paste0(i, ' (n = ',counts_,')')))
}Now we are going to look at the labels for the cells with their spatial coordinates
ggplot(spat@meta.data, aes(x= center_x, y = -center_y)) +
geom_point(size = .1) +
theme_bw() + xlab('') + ylab('')ggplot(spat@meta.data, aes(x= center_x, y = -center_y, colour = labels_general)) +
geom_point(size = .1) +
theme_bw() + xlab('') + ylab('') +
guides(colour = guide_legend(override.aes= list(size = 4)))temp <- spat@meta.data
for (i in levels(as.factor(spat$labels_general))){
# print(i)
temp_cols <- ifelse(names(cols_) == i,cols_[i], 'lightgrey')
counts_ <- sum(spat$labels_general == i)
temp$order <- temp$labels_general == i
temp <- temp[order(temp$order),]
temp$size <- ifelse(temp$order, 1,.1)
print(ggplot(temp, aes(x= center_x, y = -center_y, colour = labels_general,
size = size)) +
geom_point() +
scale_size_identity() +
theme_bw() + xlab('') + ylab('') +
scale_color_manual(values = temp_cols) +
guides(colour = guide_legend(override.aes= list(size = 4))))
}